# -*- coding: utf-8 -*-
# 豆瓣电影250爬虫demo

from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import sqlite3


def main():
    """Scrape the Douban Top 250 listing and export the result to an .xls file."""
    # Listing URL without the offset; the crawler appends the start index.
    listingUrl = "https://movie.douban.com/top250?start="
    outputFile = "豆瓣电影250.xls"

    # Crawl first, then persist everything in one go.
    movies = getDataList(listingUrl)
    saveData(movies, outputFile)


# Save the data
def saveData(dataList, savePath):
    """Write the scraped movie records to an Excel (.xls) workbook.

    dataList: sequence of records; the first eight fields of each record are
        written, in the same order as the header row.
    savePath: destination path handed to the workbook's save().
    """
    print("saveing……")
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("豆瓣电影250", cell_overwrite_ok=True)

    # Header row: one cell per column name.
    headers = ("电影链接", "图片链接", "中文名", "外文名", "评分", "评价数", "概况", "相关信息")
    for colIdx, header in enumerate(headers):
        sheet.write(0, colIdx, header)

    # Data rows start at sheet row 1, right below the header.
    for rowIdx, record in enumerate(dataList):
        print("第%d条" % rowIdx)
        for colIdx in range(len(headers)):
            sheet.write(rowIdx + 1, colIdx, record[colIdx])

    book.save(savePath)


# Extraction rules applied to the HTML of a single movie entry.
# Detail-page link, e.g. <a href="https://movie.douban.com/subject/1292052/">
# yields https://movie.douban.com/subject/1292052/
findLink = re.compile(r'<a href="(.*?)">')
# Poster image URL. [^>]*? keeps the match inside one <img> tag, so the src
# captured belongs to that tag rather than to the last src= in the entry.
# re.S makes "." match newlines as well.
findImgSrc = re.compile(r'<img[^>]*?\ssrc="(.*?)"', re.S)
# Movie title(s). Non-greedy so that several title spans on the same line
# are returned as separate matches instead of one merged capture.
findTitle = re.compile(r'<span class="title">(.*?)</span>')
# Rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of people who rated the movie
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary quote. Non-greedy for the same reason as findTitle.
findInq = re.compile(r'<span class="inq">(.*?)</span>')
# Related information paragraph; re.S because it spans several lines.
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


def _firstMatch(pattern, text, default=" "):
    """Return the first match of the compiled pattern in text, or default if none."""
    found = pattern.findall(text)
    return found[0] if found else default


def _parseItem(item):
    """Turn the HTML string of one movie entry into an 8-field row.

    Field order: link, image URL, Chinese title, foreign title, rating,
    number of ratings, summary quote, related information. A field that
    cannot be found is filled with " " so every row keeps eight cells.
    """
    data = [
        _firstMatch(findLink, item),
        _firstMatch(findImgSrc, item),
    ]

    titles = findTitle.findall(item)
    data.append(titles[0] if titles else " ")  # Chinese title
    if len(titles) == 2:
        # The second title span carries the foreign name prefixed with "/".
        data.append(titles[1].replace("/", ""))
    else:
        data.append(" ")  # placeholder keeps the columns aligned

    data.append(_firstMatch(findRating, item))
    data.append(_firstMatch(findJudge, item))

    # The summary quote is optional. Store the string itself (not the list of
    # matches) and keep the result of replace(), since str is immutable.
    quotes = findInq.findall(item)
    data.append(quotes[0].replace("。", " ") if quotes else " ")

    details = findBd.findall(item)
    if details:
        bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", details[0])  # drop <br/> tags
        bd = bd.replace("/", " ")
        data.append(bd.strip())  # trim surrounding whitespace
    else:
        data.append(" ")

    return data


# Crawl the data
def getDataList(baseUrl, pageCount=10, pageSize=25):
    """Fetch the listing pages and return one row per movie entry.

    baseUrl: listing URL ending in "start="; the offset of each page is
        appended to it.
    pageCount: number of pages to fetch (default 10).
    pageSize: entries per page, used to compute each page's offset
        (default 25, i.e. 250 entries with the default pageCount).

    Returns a list of 8-element lists, see _parseItem for the field order.
    """
    baseList = []
    for page in range(pageCount):
        url = baseUrl + str(page * pageSize)
        # 1. Fetch the page.
        response = askURl(url)
        # 2. Parse it.
        try:
            soup = BeautifulSoup(response, "html.parser")
        finally:
            # askURl returns a readable response object; release it once the
            # parser has consumed it.
            if hasattr(response, "close"):
                response.close()
        # Every movie on the page lives in its own <div class="item">.
        for item in soup.find_all('div', class_="item"):
            baseList.append(_parseItem(str(item)))
    return baseList


# Fetch the content behind one URL
def askURl(url):
    """Open url with a browser-like User-Agent and return the readable response.

    url: address to request.

    Returns the object produced by urllib.request.urlopen on success. If the
    request fails with URLError (HTTPError included), the status code and/or
    reason are printed and an empty byte stream is returned instead, so the
    caller always receives something it can read() and close().
    """
    import io  # only needed for the empty fallback stream

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=headers)

    try:
        return urllib.request.urlopen(request)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Previously this path returned an unbound local and raised
    # UnboundLocalError; hand back an empty stream instead.
    return io.BytesIO()


# Script entry point: run the full crawl when executed directly.
if __name__ == "__main__":
    # Earlier manual test calls, kept for reference:
    # baseUrl = "https://movie.douban.com/top250?start=25&filter="
    # baseUrl = "https://movie.douban.com/top250?start="
    # askURl(url)
    # getDataList(baseUrl)
    main()
    print("爬取完毕")
